Dataset & Paper:

The Breast Cancer Wisconsin (Diagnostic) dataset is a renowned collection of data used extensively in machine learning and medical research. Originating from digitized images of fine needle aspirates (FNA) of breast masses, this dataset facilitates the analysis of cell nuclei characteristics to aid in the diagnosis of breast cancer. In this article, we delve into the attributes, statistics, and significance of this dataset.

A new classifier for breast cancer detection based on Naïve Bayesian

Libraries

library(tidyverse)
library(e1071)
library(here)
library(ggcorrplot)
library(ggpubr)
library(janitor)
library(plotly)
options(scipen = 999)

Read in Data

# Read the Wisconsin breast cancer CSV; the file path is built with here()
breast_cancer_data <- here("data/wisconsin_breast_cancer_data.csv") %>%
  read_csv()

Correlation Matrix

Let’s test the assumption that our features are independent of each other

Drop columns that we don’t need in the correlation matrix

# Remove the identifier, the class label and the `...33` column so that only
# the remaining feature columns enter the correlation matrix
breast_cancer_data_cor <- select(breast_cancer_data, -c(id, diagnosis, ...33))

Create a matrix from the dataframe

# Convert the feature data frame to a plain matrix for cor()
breast_cancer_matrix <- breast_cancer_data_cor %>% as.matrix()

Calculate the correlations using cor()

# Pairwise correlations between all feature columns (cor() defaults)
breast_cancer_correlations <- breast_cancer_matrix %>% cor()

Plot the correlations

# Lower-triangle plot of the correlation matrix with the coefficient printed
# in each cell (small label size)
ggcorrplot(
  breast_cancer_correlations,
  type = "lower",
  lab = TRUE,
  lab_size = 1
)

Visualizing the correlation between two features

# Scatter plot of mean radius against mean perimeter, with a smoothed trend
# line and the correlation annotation added by stat_cor()
breast_cancer_data %>%
  ggplot(mapping = aes(x = radius_mean, y = perimeter_mean)) +
  geom_point() +
  geom_smooth() +
  stat_cor()
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

Create a Naive Bayes Model

# Fit the Naive Bayes classifier on the feature columns only.
# `id` is a record identifier and `...33` appears to be an empty column (its
# table in the model printout has no entries), so neither carries diagnostic
# information. Both are dropped here, exactly as for the correlation matrix,
# so that they are not used as predictors.
# NOTE: output rendered from the earlier fit (which still included `id` and
# `...33`) will differ once the document is re-run.
breast_cancer_mod <- naiveBayes(
  diagnosis ~ .,
  data = breast_cancer_data %>% select(-id, -...33)
)
breast_cancer_mod
## 
## Naive Bayes Classifier for Discrete Predictors
## 
## Call:
## naiveBayes.default(x = X, y = Y, laplace = laplace)
## 
## A-priori probabilities:
## Y
##         B         M 
## 0.6267606 0.3732394 
## 
## Conditional probabilities:
##    id
## Y       [,1]      [,2]
##   B 26618125 116895515
##   M 36818050 137896550
## 
##    radius_mean
## Y       [,1]     [,2]
##   B 12.15885 1.767710
##   M 17.46283 3.203971
## 
##    texture_mean
## Y       [,1]     [,2]
##   B 17.89615 3.985221
##   M 21.60491 3.779470
## 
##    perimeter_mean
## Y        [,1]     [,2]
##   B  78.16011 11.71493
##   M 115.36538 21.85465
## 
##    area_mean
## Y       [,1]     [,2]
##   B 463.5817 133.6395
##   M 978.3764 367.9380
## 
##    smoothness_mean
## Y         [,1]       [,2]
##   B 0.09258958 0.01329740
##   M 0.10289849 0.01260824
## 
##    compactness_mean
## Y         [,1]       [,2]
##   B 0.08018705 0.03374184
##   M 0.14518778 0.05398750
## 
##    concavity_mean
## Y        [,1]       [,2]
##   B 0.0461870 0.04343437
##   M 0.1607747 0.07501933
## 
##    concave points_mean
## Y         [,1]       [,2]
##   B 0.02578965 0.01587242
##   M 0.08799000 0.03437391
## 
##    symmetry_mean
## Y        [,1]       [,2]
##   B 0.1742295 0.02482803
##   M 0.1929090 0.02763809
## 
##    fractal_dimension_mean
## Y         [,1]        [,2]
##   B 0.06287871 0.006753448
##   M 0.06268009 0.007573315
## 
##    radius_se
## Y        [,1]      [,2]
##   B 0.2837969 0.1125986
##   M 0.6090825 0.3450386
## 
##    texture_se
## Y       [,1]      [,2]
##   B 1.219797 0.5899058
##   M 1.210915 0.4831781
## 
##    perimeter_se
## Y       [,1]      [,2]
##   B 1.998783 0.7717058
##   M 4.323929 2.5685457
## 
##    area_se
## Y       [,1]     [,2]
##   B 21.14072  8.85529
##   M 72.67241 61.35527
## 
##    smoothness_se
## Y          [,1]        [,2]
##   B 0.007195921 0.003064917
##   M 0.006780094 0.002890430
## 
##    compactness_se
## Y         [,1]       [,2]
##   B 0.02148538 0.01635023
##   M 0.03228117 0.01838719
## 
##    concavity_se
## Y         [,1]       [,2]
##   B 0.02606976 0.03293560
##   M 0.04182401 0.02160343
## 
##    concave points_se
## Y          [,1]        [,2]
##   B 0.009885343 0.005692600
##   M 0.015060472 0.005517362
## 
##    symmetry_se
## Y         [,1]        [,2]
##   B 0.02056646 0.007000697
##   M 0.02047240 0.010064888
## 
##    fractal_dimension_se
## Y          [,1]        [,2]
##   B 0.003638447 0.002942005
##   M 0.004062406 0.002041498
## 
##    radius_worst
## Y       [,1]     [,2]
##   B 13.39082 1.973166
##   M 21.13481 4.283569
## 
##    texture_worst
## Y       [,1]     [,2]
##   B 23.49581 5.489610
##   M 29.31821 5.434804
## 
##    perimeter_worst
## Y        [,1]     [,2]
##   B  87.08416 13.46504
##   M 141.37033 29.45706
## 
##    area_worst
## Y        [,1]     [,2]
##   B  559.7149 163.1035
##   M 1422.2863 597.9677
## 
##    smoothness_worst
## Y        [,1]       [,2]
##   B 0.1250578 0.01995512
##   M 0.1448452 0.02186983
## 
##    compactness_worst
## Y        [,1]       [,2]
##   B 0.1830047 0.09209558
##   M 0.3748241 0.17037198
## 
##    concavity_worst
## Y        [,1]      [,2]
##   B 0.1667047 0.1402874
##   M 0.4506056 0.1815067
## 
##    concave points_worst
## Y         [,1]       [,2]
##   B 0.07465346 0.03562873
##   M 0.18223731 0.04630779
## 
##    symmetry_worst
## Y        [,1]       [,2]
##   B 0.2701986 0.04179392
##   M 0.3234679 0.07468496
## 
##    fractal_dimension_worst
## Y         [,1]       [,2]
##   B 0.07946750 0.01381510
##   M 0.09152995 0.02155289
## 
##    ...33
## Y   FALSE TRUE
##   B           
##   M

Predict the diagnosis for the breast cancer dataset

# Append the model's predicted class for every row. The model was fitted on
# this same data frame, so these are in-sample predictions.
breast_cancer_data <- mutate(
  breast_cancer_data,
  predicted_diagnosis = predict(breast_cancer_mod, newdata = breast_cancer_data)
)

Create the confusion matrix

# Confusion matrix: actual diagnosis in rows, predicted diagnosis in columns,
# shown as row percentages with the raw counts appended
breast_cancer_data %>%
  tabyl(diagnosis, predicted_diagnosis) %>%
  adorn_percentages(denominator = "row") %>%
  adorn_pct_formatting(digits = 2) %>%
  adorn_ns()
##  diagnosis            B            M
##          B 96.07% (342)  3.93%  (14)
##          M 10.38%  (22) 89.62% (190)

Assuming normal distributions

# Overlaid density curves of area_mean, one per diagnosis
area_mean_density <- breast_cancer_data %>%
  ggplot(mapping = aes(x = area_mean, fill = diagnosis)) +
  geom_density(alpha = 0.7) +
  theme_minimal() +
  scale_fill_viridis_d(option = "magma")

# Interactive version of the density plot
ggplotly(area_mean_density)
# Gaussian parameters of area_mean per class, read from the fitted model
# instead of being copied by hand from its printout (hand-copied values go
# stale whenever the data or the model changes). For a numeric predictor the
# model stores one row per class, with the mean in column 1 and the standard
# deviation in column 2.
area_mean_params <- breast_cancer_mod$tables[["area_mean"]]

# Parameters for the first normal distribution (class "M")
mean1 <- unname(area_mean_params["M", 1])
sd1 <- unname(area_mean_params["M", 2])

# Parameters for the second normal distribution (class "B")
mean2 <- unname(area_mean_params["B", 1])
sd2 <- unname(area_mean_params["B", 2])

# Evaluate both densities on a common grid spanning +/- 4 standard deviations
# around each mean
x <- seq(
  min(mean1 - 4 * sd1, mean2 - 4 * sd2),
  max(mean1 + 4 * sd1, mean2 + 4 * sd2),
  length.out = 1000
)
y1 <- dnorm(x, mean = mean1, sd = sd1)
y2 <- dnorm(x, mean = mean2, sd = sd2)

# Round the densities to 5 decimal places here so the interactive tooltips
# show short values
normal_data <- data.frame(
  x = x,
  y1 = round(y1, 5),
  y2 = round(y2, 5)
)

# Create the ggplot with both normal distributions
# ("Distribution 1" uses the class "M" parameters, "Distribution 2" class "B")
normal_dist_plot <- ggplot(normal_data, aes(x = x)) +
  geom_line(aes(y = y1, color = "Distribution 1")) +
  geom_line(aes(y = y2, color = "Distribution 2")) +
  labs(title = "Assumed Normal Distributions", x = "area mean", y = "Density") +
  scale_color_manual(values = c("Distribution 1" = "gold", "Distribution 2" = "grey")) +
  theme_minimal() +
  # Limit the x axis to 143-2500; grid points outside this window are not drawn
  xlim(143, 2500)

# Make the plot interactive, showing only x and y in the tooltips
ggplotly(normal_dist_plot, tooltip = c("x", "y"))

Assigning Probabilities to Benign and Malignant

# Per-class scores from predict(type = "raw"), stored as a matrix column with
# one column per class
breast_cancer_data <- mutate(
  breast_cancer_data,
  predicted_diagnosis_raw = predict(
    breast_cancer_mod,
    newdata = breast_cancer_data,
    type = "raw"
  )
)

# Histogram of the raw score for class "B", filled by the actual diagnosis
prediction_hist <- breast_cancer_data %>%
  ggplot(mapping = aes(x = predicted_diagnosis_raw[, "B"], fill = diagnosis)) +
  geom_histogram(bins = 100) +
  theme_minimal() +
  scale_fill_viridis_d()

# Interactive version of the histogram
ggplotly(prediction_hist)

Investigating False Negatives

# Label every row with its confusion-matrix cell, treating "M" as the
# positive class and "B" as the negative class
breast_cancer_data <- mutate(
  breast_cancer_data,
  confusion_category = case_when(
    diagnosis == "M" & predicted_diagnosis == "M" ~ "True Positive",
    diagnosis == "B" & predicted_diagnosis == "B" ~ "True Negative",
    diagnosis == "M" & predicted_diagnosis == "B" ~ "False Negative",
    diagnosis == "B" & predicted_diagnosis == "M" ~ "False Positive",
    # Fallback for rows matching none of the four combinations above
    TRUE ~ "Unknown"
  )
)

# Number of rows in each category
table(breast_cancer_data[["confusion_category"]])
## 
## False Negative False Positive  True Negative  True Positive 
##             22             14            342            190
# Density of smoothness_worst, one semi-transparent curve per confusion
# category
smoothness_worst_density <- breast_cancer_data %>%
  ggplot(mapping = aes(x = smoothness_worst, fill = confusion_category)) +
  geom_density(alpha = 0.5) +
  theme_minimal() +
  scale_fill_viridis_d(option = "magma")

# Interactive version of the density plot
ggplotly(smoothness_worst_density)
# Density of area_mean, one semi-transparent curve per confusion category
area_density_plot <- breast_cancer_data %>%
  ggplot(mapping = aes(x = area_mean, fill = confusion_category)) +
  geom_density(alpha = 0.5) +
  theme_minimal() +
  scale_fill_viridis_d(option = "magma")

# Interactive version of the density plot
ggplotly(area_density_plot)